This document contains time series analysis for the DSAN5100 final project.
Load data files and preprocess
# Load required libraries for Rlibrary(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.2.1
✔ lubridate 1.9.3 ✔ tidyr 1.3.1
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)library(arrow)
Attaching package: 'arrow'
The following object is masked from 'package:lubridate':
duration
The following object is masked from 'package:utils':
timestamp
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:arrow':
schema
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library(zoo)
Attaching package: 'zoo'
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
library(forecast)
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
# Set Arrow option to skip null charactersoptions(arrow.skip_nul =TRUE)# Set seed for reproducibilityset.seed(42)
# A tibble: 6 × 16
ID publishedAt `source-name` location_code location category year month
<int> <chr> <chr> <chr> <chr> <chr> <int> <int>
1 816 2020-08-07T11… Minneapolis … us United … general 2020 8
2 901 2020-08-07T11… Google News us United … general 2020 8
3 961 2020-08-07T11… CNBC us United … general 2020 8
4 1054 2020-08-07T12… Google News us United … general 2020 8
5 1147 2020-08-07T12… CNET us United … general 2020 8
6 1335 2020-08-07T13… The Washingt… us United … general 2020 8
# ℹ 8 more variables: new_title <chr>, neg <dbl>, neu <dbl>, pos <dbl>,
# compound <dbl>, sentiment_category <chr>, bias_category <chr>,
# bias_score <dbl>
# Change publishedAt column into a date type# Function to convert publishedAt to date typeconvert_date_column <-function(df) { df <- df %>%mutate(publishedAt =as.Date(publishedAt))return(df)}# Apply date conversion to all dataframesargentina_df <-convert_date_column(argentina_df)canada_df <-convert_date_column(canada_df)china_df <-convert_date_column(china_df)india_df <-convert_date_column(india_df)italy_df <-convert_date_column(italy_df)russia_df <-convert_date_column(russia_df)us_df <-convert_date_column(us_df)# Check the data type conversioncat("Date type conversion completed:\n")
Date type conversion completed:
cat("publishedAt column type for US data:", class(us_df$publishedAt), "\n")
publishedAt column type for US data: Date
# Check date range for various country datacat("Date range for US data:", as.character(range(us_df$publishedAt, na.rm =TRUE)), "\n")
Date range for US data: 2015-04-01 2021-11-29
cat("Date range for Argentina data:", as.character(range(argentina_df$publishedAt, na.rm =TRUE)), "\n")
Date range for Argentina data: 2016-10-02 2021-11-29
cat("Date range for Canada data:", as.character(range(canada_df$publishedAt, na.rm =TRUE)), "\n")
Date range for Canada data: 2013-12-26 2021-11-29
cat("Date range for China data:", as.character(range(china_df$publishedAt, na.rm =TRUE)), "\n")
Date range for China data: 2019-11-23 2021-11-29
cat("Date range for India data:", as.character(range(india_df$publishedAt, na.rm =TRUE)), "\n")
Date range for India data: 2015-04-25 2021-11-29
cat("Date range for Italy data:", as.character(range(italy_df$publishedAt, na.rm =TRUE)), "\n")
Date range for Italy data: 2016-07-10 2021-11-29
cat("Date range for Russia data:", as.character(range(russia_df$publishedAt, na.rm =TRUE)), "\n")
Date range for Russia data: 2010-11-24 2021-11-29
All of the countries have an end date of 2021-11-29 after which the data was no longer collected. The earliest data for the published articles range from 2010-11-24 for Russia to 2019-11-23 for China
# Combine the country datasets into df_countries# Set Arrow option to handle null charactersoptions(arrow.skip_nul =TRUE)df_countries <-bind_rows( us_df, canada_df, china_df, india_df, italy_df, russia_df, argentina_df)
Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector
Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector
Warning in vec_rbind(!!!dots, .names_to = .id, .error_call = current_env()):
Stripping '\0' (nul) from character vector
# Calculate percentage of articles in each sentiment category (positive, neutral, negative)sentiment_distribution <- df_countries %>%mutate(sentiment_category =case_when( compound >=0.05~"Positive", compound <=-0.05~"Negative", TRUE~"Neutral" ) ) %>%group_by(category, sentiment_category) %>%summarise(count =n(), .groups ='drop') %>%group_by(category) %>%mutate(total =sum(count),percentage =round(count / total *100, 1) ) %>%select(category, sentiment_category, count, percentage) %>%arrange(category, desc(percentage))print("\nSentiment Distribution by Category (%):")
[1] "\nSentiment Distribution by Category (%):"
print(sentiment_distribution)
# A tibble: 21 × 4
# Groups: category [7]
category sentiment_category count percentage
<chr> <chr> <int> <dbl>
1 business Neutral 65603 44.3
2 business Positive 49010 33.1
3 business Negative 33617 22.7
4 entertainment Positive 63147 37.8
5 entertainment Neutral 57490 34.4
6 entertainment Negative 46544 27.8
7 general Neutral 27439 40.3
8 general Negative 21028 30.9
9 general Positive 19630 28.8
10 health Neutral 36048 42.2
# ℹ 11 more rows
# View distribution of sentiment by categorydf_countries %>%ggplot(aes(x = category, y = compound, fill = category)) +geom_boxplot() +theme(axis.text.x =element_text(angle =45, hjust =1)) +labs(title ="Sentiment by Category",x ="Category",y ="Compound Sentiment Score") +theme_minimal() +guides(fill ="none")
The average sentiment across all categories is neutral (~0.0).
NOTE: Typical threshold values (used in literature cited on the vaderSentiment documentation) are: - Positive sentiment: compound score >= 0.05 - Neutral sentiment: compound score > -0.05 and compound score < 0.05 - Negative sentiment: compound score <= -0.05
Sports, Technology, Entertainment have an average positive sentiment (0.05) while the rest of the categories are neutral. These three categories general induce excitement in society hence why they lean on the positive side.
Science has the smallest interquartile range, possibly due to how science news tends to be more objective and less emotionally loaded.
On the other hand, entertainment has the largest interquartile range as the news contains the widest emotional swings and the most sensationalized, highly varied content of any category, from celebrity scandals to movie releases
General Category
# Average sentiment (compound score) time series by country in the general category# Transform dataframe to only include general category, countries as columns, data as row index, values as average sentiment scoregeneral_df <- df_countries_2020 %>%filter(category =="general") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)
# Visualize combined time series plotsgeneral_plot <-ggplot(general_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average General Sentiment for the Countries",subtitle ="From 2020 - 2021 (Full timeline shown for all countries)",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) ) +scale_x_date(limits =c(as.Date("2020-01-01"), as.Date("2021-12-31")),date_breaks ="6 months", date_labels ="%Y-%m" )
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ggplotly(general_plot) %>%layout(hovermode ="x")
In the general news category, most countries have an average sentiment score of neutral (-0.05 < compound score < 0.05) except for the USA and China. China sentiment score is significant more positive (>0.05) while the USA sentiment score leans into negative across time (<0.05)
The observation about China might be due to state controlled media resulting in positive news most of the time. The observation about the US might be due to how polarized the US is leading to constant feed of slightly negative news in general.
Except for China, the time series plots for the rest of the countries tend to skew towards negative. Research shows that news tends to skew negative due to negativity bias where humans naturally pay more attention to negative information
Business category
# Average sentiment (compound score) time series by country in the business category# Transform dataframe to only include business category, countries as columns, data as row index, values as average sentiment scorebusiness_df <- df_countries_2020 %>%filter(category =="business") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(business_df)
# A tibble: 6 × 8
date Argentina Canada China India Italy `Russian Federation`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-02-01 0 NA NA NA NA NA
2 2020-03-01 NA NA NA NA NA -0.382
3 2020-07-01 0.128 NA NA NA NA -0.326
4 2020-08-01 0.0505 0.00298 0.125 0.0838 0.0741 0.0122
5 2020-09-01 0.0263 0.00238 0.0860 0.0648 0.0665 0.0168
6 2020-10-01 0.0310 0.0123 0.0885 0.120 0.0616 0.0172
# ℹ 1 more variable: `United States` <dbl>
# Visualize combined time series plotsbusiness_plot <-ggplot(business_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Business Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(business_plot) %>%layout(hovermode ="x")
The time series for business shows countries having neutral to positive sentiment on average across time. There are slight seasonal observations due to business cycles: - New Year dips possible due to slowed down economic activity during the December holidays and negative year-end reporting/fewer positive stories - A rise by March as companies announce new strategies and growth forecasts, including governments publishing new-year economic outlooks and investors returning to the market after holidays - Slight mid year decline as many countries experience an economic mid-year slowdown and there are possible downward revisions
Health
# Average sentiment (compound score) time series by country in the health category# Transform dataframe to only include health category, countries as columns, data as row index, values as average sentiment scorehealth_df <- df_countries_2020 %>%filter(category =="health") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(health_df)
# A tibble: 6 × 8
date Argentina Canada China India Italy `Russian Federation`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 NA NA NA NA NA -0.382
2 2020-02-01 NA NA NA NA NA NA
3 2020-03-01 NA -0.389 NA NA 0.420 NA
4 2020-04-01 NA NA NA NA NA NA
5 2020-05-01 NA NA NA NA NA NA
6 2020-07-01 0.153 NA NA NA 0 NA
# ℹ 1 more variable: `United States` <dbl>
# Visualize combined time series plotshealth_plot <-ggplot(health_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Health Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(health_plot) %>%layout(hovermode ="x")
In the health category, there is a huge spike around July 2020 as multiple vaccine candidates for early COVID-19 vaccine breakthroughs are announced. The coverage also shifted from panic to progress and solutions. After that the sentiment drops to low levels reaching negative as the second and third COVID waves hit globally. Long COVID, healthcare shortages, and pandemic fatigue keep the sentiment low till late 2020 especially in the USA and China. Argentina had a positive sentiment category for most of the period. This might have been due to Argentina’s news coverage being unusually government-optimistic by design and positivity inflation of the spanish language sentiment lexicons by the Vader packages
Technology
# Average sentiment (compound score) time series by country in the technology category# Transform dataframe to only include technology category, countries as columns, data as row index, values as average sentiment scoretechnology_df <- df_countries_2020 %>%filter(category =="technology") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(technology_df)
# Visualize combined time series plotstechnology_plot <-ggplot(technology_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Technology Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(technology_plot) %>%layout(hovermode ="x")
The sentiment around tech articles appears to follow increased sentiment during the fall (Sept - Nov) during the major product launch season before a slight drop at the beginning of the year possible due to year in review critical pieces, layoffs, and new regulations. There is an increase in sentiment during the summer (June - Aug) due to developer conferences and positive innovation headlines
Entertainment
# Average sentiment (compound score) time series by country in the entertainment category# Transform dataframe to only include entertainment category, countries as columns, data as row index, values as average sentiment scoreentertainment_df <- df_countries_2020 %>%filter(category =="entertainment") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(entertainment_df)
# A tibble: 6 × 8
date Argentina Canada China India Italy `Russian Federation`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 NA NA NA NA NA -0.210
2 2020-02-01 NA NA NA NA NA 0.133
3 2020-03-01 NA NA NA NA NA 0.496
4 2020-04-01 NA NA NA NA NA -0.337
5 2020-05-01 NA NA NA NA NA 0.273
6 2020-06-01 NA NA NA NA NA -0.417
# ℹ 1 more variable: `United States` <dbl>
# Visualize combined time series plotsentertainment_plot <-ggplot(entertainment_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Entertainment Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(entertainment_plot) %>%layout(hovermode ="x")
China and India have a positive sentiment in entertainment while the other countries have a moving average around neutral. This pattern might reflect cultural and structural differences in media reporting: both countries emphasize celebratory and promotional coverage of films, music, and celebrities.
Sports
# Average sentiment (compound score) time series by country in the sports category# Transform dataframe to only include sports category, countries as columns, data as row index, values as average sentiment scoresports_df <- df_countries_2020 %>%filter(category =="sports") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(sports_df)
# A tibble: 6 × 8
date Argentina Canada China India Italy `Russian Federation`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-03-01 NA NA NA 0 NA NA
2 2020-05-01 NA 0 NA NA NA NA
3 2020-08-01 0.0430 0.0701 0.189 0.0998 0.0805 0.106
4 2020-09-01 0.0483 0.0913 0.165 0.0928 0.0691 0.0951
5 2020-10-01 0.0434 0.0887 0.189 0.0992 0.0484 0.0853
6 2020-11-01 0.00532 0.0740 0.157 0.0999 0.0285 0.0937
# ℹ 1 more variable: `United States` <dbl>
# Visualize combined time series plotssports_plot <-ggplot(sports_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Sports Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(sports_plot) %>%layout(hovermode ="x")
In sports, most countries have a positive sentiment, with China at a significantly higher moving average from other countries possible due to cultural differences. The spike in 2021-05 corresponds to the end of European football while the spike in 2021-09 corresponds to the start of many sports leagues (MLB, NFL, EPL)
Science
# Average sentiment (compound score) time series by country in the science category# Transform dataframe to only include science category, countries as columns, data as row index, values as average sentiment scorescience_df <- df_countries_2020 %>%filter(category =="science") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(location, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(location, year_month, avg_compound) %>%pivot_wider(names_from = location, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(science_df)
# A tibble: 6 × 8
date Argentina Canada China India Italy `Russian Federation`
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 NA 0.159 NA 0.0306 NA NA
2 2020-02-01 NA 0.106 NA 0.0254 0 NA
3 2020-03-01 NA -0.118 0 0.318 0 NA
4 2020-04-01 NA -0.0250 NA 0.138 NA NA
5 2020-05-01 NA -0.023 NA 0.181 NA NA
6 2020-06-01 NA 0.0135 NA -0.0163 NA 0
# ℹ 1 more variable: `United States` <dbl>
# Visualize combined time series plotsscience_plot <-ggplot(science_df, aes(x = date)) +geom_line(aes(y = Argentina, color ="Argentina"), size =1, na.rm =TRUE) +geom_line(aes(y = Canada, color ="Canada"), size =1, na.rm =TRUE) +geom_line(aes(y = China, color ="China"), size =1, na.rm =TRUE) +geom_line(aes(y = India, color ="India"), size =1, na.rm =TRUE) +geom_line(aes(y = Italy, color ="Italy"), size =1, na.rm =TRUE) +geom_line(aes(y =`Russian Federation`, color ="Russia"), size =1, na.rm =TRUE) +geom_line(aes(y =`United States`, color ="United States"), size =1, na.rm =TRUE) +labs(title ="Average Science Sentiment for the Countries",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Countries" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) )ggplotly(science_plot) %>%layout(hovermode ="x")
Science sentiment shows a neutral moving average across time with a spike in 2020-07 due to early announcements of the COVID-19 breakthrough
Analysis of sentiment and bias in USA news
Sentiment Analysis
# Average sentiment by category# Transform df_country to only include USA, then date, categories as columns, and average sentiment as valuesusa_df <- df_countries %>%filter(location =="United States") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(category, year_month) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(category, year_month, avg_compound) %>%pivot_wider(names_from = category, values_from = avg_compound ) %>%arrange(year_month) %>%rename(date = year_month)head(usa_df)
# A tibble: 6 × 8
date business entertainment general health science sports technology
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2015-04-01 NA NA NA 0.637 NA NA NA
2 2015-05-01 NA NA NA NA 0 NA NA
3 2015-10-01 NA NA NA NA 0 NA NA
4 2015-11-01 NA NA NA NA -0.459 NA NA
5 2016-05-01 NA NA NA NA 0 NA NA
6 2016-06-01 NA NA NA 0 NA NA NA
# Time series plot for average sentiment by categoryusa_category_plot <-ggplot(usa_df, aes(x = date)) +geom_line(aes(y = general, color ="General"), size =1, na.rm =TRUE) +geom_line(aes(y = business, color ="Business"), size =1, na.rm =TRUE) +geom_line(aes(y = health, color ="Health"), size =1, na.rm =TRUE) +geom_line(aes(y = technology, color ="Technology"), size =1, na.rm =TRUE) +geom_line(aes(y = entertainment, color ="Entertainment"), size =1, na.rm =TRUE) +geom_line(aes(y = sports, color ="Sports"), size =1, na.rm =TRUE) +geom_line(aes(y = science, color ="Science"), size =1, na.rm =TRUE) +labs(title ="Average Sentiment by Category for USA News",subtitle ="From 2020 - 2021",x ="Date",y ="Average sentiment score (-1 to 1)",color ="Category" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) ) +scale_x_date(limits =c(as.Date("2020-01-01"), as.Date("2021-12-31")),date_breaks ="6 months", date_labels ="%Y-%m" )ggplotly(usa_category_plot) %>%layout(hovermode ="x")
# Compare average sentiment across categories (boxplot)df_countries %>%filter(location =="United States") %>%ggplot(aes(x = category, y = compound, fill = category)) +geom_boxplot() +theme(axis.text.x =element_text(angle =45, hjust =1)) +labs(title ="Sentiment Distribution by Category for USA News",subtitle ="From 2020 - 2021",x ="Category",y ="Compound Sentiment Score") +theme_minimal() +guides(fill ="none")
In general, science and sports have a higher moving average of sentiment across time due to the objective nature of science articles and exciting nature of entertainment headlines while general and health had the lowest sentiment due to the COVID-19 pandemic
# Average bias by category# Transform df_country to only include USA, then date, categories as columns, and average bias as valuesusa_bias_df <- df_countries %>%filter(location =="United States") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(category, year_month) %>%summarise(avg_bias =mean(bias_score, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%select(category, year_month, avg_bias) %>%pivot_wider(names_from = category, values_from = avg_bias ) %>%arrange(year_month) %>%rename(date = year_month)head(usa_bias_df)
# A tibble: 6 × 8
date business entertainment general health science sports technology
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2015-04-01 NA NA NA 0.973 NA NA NA
2 2015-05-01 NA NA NA NA -0.522 NA NA
3 2015-10-01 NA NA NA NA 0.555 NA NA
4 2015-11-01 NA NA NA NA -0.659 NA NA
5 2016-05-01 NA NA NA NA -0.894 NA NA
6 2016-06-01 NA NA NA 0.615 NA NA NA
# Time series plot for average bias by categoryusa_bias_plot <-ggplot(usa_bias_df, aes(x = date)) +geom_line(aes(y = general, color ="General"), size =1, na.rm =TRUE) +geom_line(aes(y = business, color ="Business"), size =1, na.rm =TRUE) +geom_line(aes(y = health, color ="Health"), size =1, na.rm =TRUE) +geom_line(aes(y = technology, color ="Technology"), size =1, na.rm =TRUE) +geom_line(aes(y = entertainment, color ="Entertainment"), size =1, na.rm =TRUE) +geom_line(aes(y = sports, color ="Sports"), size =1, na.rm =TRUE) +geom_line(aes(y = science, color ="Science"), size =1, na.rm =TRUE) +labs(title ="Average Bias by Category for USA News",subtitle ="From 2020 - 2021",x ="Date",y ="Average bias score (0 to 1)",color ="Category" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1) ) +scale_x_date(limits =c(as.Date("2020-01-01"), as.Date("2021-12-31")),date_breaks ="6 months", date_labels ="%Y-%m" )ggplotly(usa_bias_plot) %>%layout(hovermode ="x")
# Compare average bias across categories (boxplot)df_countries %>%filter(location =="United States") %>%ggplot(aes(x = category, y = bias_score, fill = category)) +geom_boxplot() +theme(axis.text.x =element_text(angle =45, hjust =1)) +labs(title ="Bias Distribution by Category for USA News",subtitle ="From 2020 - 2021",x ="Category",y ="Bias Score") +theme_minimal() +guides(fill ="none")
Analysis Health, General, Science categories combined
In to capture the sentiment around COVID-19 and also leverage the scarse related data in the dataset, we have combined the Health, Science, and General category to explore the sentiment and bias during the COVID-19 pandemic (2020-02 to 2021-11)
# Combined time series plot for sentiment and bias across health, general, and science categories, daily# Prepare daily combined data for USA with both sentiment and bias for health, general, and science categoriesusa_combined_daily <- df_countries %>%filter(location =="United States", category %in%c("health", "general", "science", "business")) %>%group_by(publishedAt) %>%summarise(avg_sentiment =mean(compound, na.rm =TRUE),avg_bias =mean(bias_score, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%rename(date = publishedAt) %>%# Transform to long format for plottingpivot_longer(cols =c(avg_sentiment, avg_bias),names_to ="metric",values_to ="score" ) %>%mutate(metric =case_when( metric =="avg_sentiment"~"Sentiment", metric =="avg_bias"~"Bias" ) )head(usa_combined_daily)
# Create combined plot with dual y-axes# Create the base plot with sentiment and bias for combined health, general, and science categoriesusa_combined_plot <-ggplot(usa_combined_daily, aes(x = date, y = score, color = metric)) +geom_line(size =0.5, alpha =0.7) +facet_wrap(~ metric, scales ="free_y", ncol =1) +labs(title ="Daily Sentiment and Bias Trends for USA Health, General & Science News",subtitle ="From 2020 - 2021 (Combined Categories Analysis)",x ="Date",y ="Score",color ="Metric" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold") ) +scale_x_date(,date_breaks ="3 months", date_labels ="%Y-%m" ) +scale_color_manual(values =c("Sentiment"="#2E86AB", "Bias"="#A23B72"))ggplotly(usa_combined_plot) %>%layout(hovermode ="x")
Before 2020, the time series shows random bias and sentiment distribution
After the first confirmed U.S. COVID-19 case, news sentiment started neutral as initial reporting was factual. A sharp negative drop in May 2020 coincided with rapidly rising cases and crises in hospitals. Optimism returned briefly in July 2020 due to early vaccine trial progress and partial reopening measures. Sentiment then stabilized around neutral with occasional negative dips, including during the December 2020 holiday surge. In 2021, coverage remained neutral to slightly negative, but July 2021 saw the lowest sentiment of the year, corresponding to the Delta variant surge and heightened public health concerns
Sentiment during Christmas and New Year 2020–2021 was particularly low due to a combination of factors: a surge in COVID-19 cases following holiday gatherings, public health warnings discouraging travel and celebrations, widespread pandemic fatigue, and uncertainty surrounding the early vaccine rollout. Media coverage during this period was dominated by negative or cautionary language, leading to a pronounced dip in sentiment despite the holiday season.
# Decomposition of combined sentiment time series in the USA# Prepare monthly aggregated sentiment data for USA (health, general, science combined)usa_combined_monthly <- df_countries %>%filter(location =="United States", category %in%c("health", "general", "science")) %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_sentiment =mean(compound, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencycombined_ts <-ts(usa_combined_monthly$avg_sentiment, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(combined_ts)
# Time indext <-time(combined_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Combined Sentiment (Health, General, Science)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )
The decomposition shows a gradual decline in overall news sentiment since 2015, reflecting increasingly neutral or negative coverage over time. Superimposed on this trend is a clear seasonal pattern, with predictable dips early in the year, spikes around mid-year events, smaller mid-year peaks, and another year-end spike likely tied to holidays or major announcements. These seasonal oscillations highlight the recurring rhythm of news sentiment in response to cyclical events.
# Decomposition of combined bias time series in the USA# Prepare monthly aggregated bias data for USA (health, general, science combined)usa_combined_bias_monthly <- df_countries %>%filter(location =="United States", category %in%c("health", "general", "science")) %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_bias =mean(bias_score, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencycombined_bias_ts <-ts(usa_combined_bias_monthly$avg_bias, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(combined_bias_ts)
# Time indext <-time(combined_bias_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Combined Bias (Health, General, Science)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )
# Create US data frame with publishedAt, category, year, month, avg_compound, and avg_bias# Create aggregated US data frame by date and categoryus_daily_combinedC_df <- df_countries %>%filter(location =="United States", category %in%c("health", "general", "science")) %>%group_by(publishedAt, category) %>%summarise(avg_compound =mean(compound, na.rm =TRUE),avg_bias =mean(bias_score, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%mutate(year =year(publishedAt),month =month(publishedAt) ) %>%select(publishedAt, category, year, month, avg_compound, avg_bias, n_articles) %>%arrange(publishedAt, category)# Display first few rows and summaryhead(us_daily_combinedC_df, 10)
# Extract day from date e.g., 1 for Sunday, 2 for Monday, etc.# Add day of week information to the US daily dataframeus_daily_combinedC_df <- us_daily_combinedC_df %>%mutate(# wday() returns 1 for Sunday, 2 for Monday, ..., 7 for Saturdayday_of_week =wday(publishedAt),# wday() with label=TRUE, abbr=FALSE returns full day names day_full_name =wday(publishedAt, label =TRUE, abbr =FALSE) )# Display updated dataframe with day informationhead(us_daily_combinedC_df, 10)
The linear regression shows that headline category is the strongest predictor of daily average sentiment: science news is associated with the highest positive shift, followed by health news. Bias has a small positive effect on sentiment, though only marginally significant. Day-of-week effects are negligible. Overall, the model explains about 14% of the variation in daily sentiment, indicating that much of sentiment variation is driven by factors not captured in this model
# Convert residuals to time series component and look at the residuals ACF# Extract residuals from the regression modelresiduals_data <-residuals(us_sentiment_combinedC_model)# Create a dataframe with residuals and dates for proper orderingresiduals_df <-data.frame(residuals = residuals_data,publishedAt = us_daily_combinedC_df$publishedAt,category = us_daily_combinedC_df$category) %>%arrange(publishedAt)# Convert residuals to time series object# Note: Since we have irregular time series (not every day has data for every category),# we'll work with the residuals as a vector for ACF analysisres_fit <- residuals_df$residuals# Create ACF plot for residualsacf_plot <-ggAcf(res_fit, lag.max =40) +labs(title ="Autocorrelation Function (ACF) of Regression Residuals",subtitle ="US Daily News Sentiment Model Residuals",x ="Lag",y ="ACF" ) +theme_minimal()print(acf_plot)
The ACF plot of regression residuals shows low autocorrelation, with only three minor spikes outside the confidence bounds: at lag 2, and around lags 30–35. This indicates that the model largely captures the systematic variation in sentiment, and the residuals behave approximately as white noise. The spikes may reflect small short-term or monthly cycles in news sentiment not fully accounted for by the model
# Lag plot for sentiment in USA# Create a time series of daily average sentiment for USAusa_sentiment_combinedC_daily <- us_daily_combinedC_df %>%group_by(publishedAt) %>%summarise(daily_avg_sentiment =mean(avg_compound, na.rm =TRUE),.groups ='drop' ) %>%arrange(publishedAt) %>%filter(!is.na(daily_avg_sentiment))# Convert to time series objectusa_sentiment_combinedC.ts <-ts(usa_sentiment_combinedC_daily$daily_avg_sentiment, frequency =365)# Create lag plots with 4 different lagsgglagplot(usa_sentiment_combinedC.ts, do.lines =FALSE, lags =4) +labs(title ="Lag Plots: USA Daily Sentiment Analysis",subtitle ="Autocorrelation patterns in daily sentiment data",x ="Y(t-k)",y ="Y(t)" ) +theme_minimal() +theme(strip.text =element_text(face ="bold"),plot.title =element_text(size =14, face ="bold"),plot.subtitle =element_text(size =12) )
The lag plot of daily sentiment shows points scattered around zero, forming a roughly circular pattern. This indicates that consecutive daily sentiment values are largely independent, with no strong linear autocorrelation. The circular scatter may hint at mild non-linear dependencies or recurring cycles, but overall, sentiment appears mostly driven by individual events rather than day-to-day persistence.
# Lag plot for bias in USA# Create a time series of daily average bias for USAusa_bias_combinedC_daily <- us_daily_combinedC_df %>%group_by(publishedAt) %>%summarise(daily_avg_bias =mean(avg_bias, na.rm =TRUE),.groups ='drop' ) %>%arrange(publishedAt) %>%filter(!is.na(daily_avg_bias))# Convert to time series objectusa_bias_combinedC.ts <-ts(usa_bias_combinedC_daily$daily_avg_bias, frequency =365)# Create lag plots with 4 different lagsgglagplot(usa_bias_combinedC.ts, do.lines =FALSE, lags =4) +labs(title ="Lag Plots: USA Daily Bias Analysis",subtitle ="Autocorrelation patterns in daily bias data",x ="Y(t-k)",y ="Y(t)" ) +theme_minimal() +theme(strip.text =element_text(face ="bold"),plot.title =element_text(size =14, face ="bold"),plot.subtitle =element_text(size =12) )
The lag plot of daily bias shows points scattered around zero, forming a roughly circular pattern. This indicates that consecutive daily sentiment values are largely independent, with no strong linear autocorrelation. The circular scatter may hint at mild non-linear dependencies or recurring cycles, but overall, sentiment appears mostly driven by individual events rather than day-to-day persistence.
# ACF plot for USA daily sentimentggAcf(usa_sentiment_combinedC_daily$daily_avg_sentiment, lag.max =40) +labs(title ="Autocorrelation Function (ACF): USA Daily Sentiment",subtitle ="Identifying temporal dependencies in daily sentiment data",x ="Lag (days)",y ="ACF" ) +theme_minimal() +theme(plot.title =element_text(size =14, face ="bold"),plot.subtitle =element_text(size =12) )
The ACF of the daily sentiment series (lags 1–40) shows that most autocorrelations fall within the confidence bounds, indicating weak temporal dependence. Only five lags show significant autocorrelation, suggesting minor short-term or periodic effects in news sentiment. Overall, sentiment appears largely event-driven rather than strongly influenced by preceding days.
While the sentiment series exhibits weak autocorrelation, the presence of seasonal cycles and minor short-term dependencies justifies exploring time series models, particularly ones that can incorporate exogenous event-related variables. Pure autoregressive models may have limited predictive power due to the event-driven nature of sentiment, but a seasonal or dynamic regression framework could improve forecasting and provide insight into temporal patterns
# PACF plot for USA daily sentimentggPacf(usa_sentiment_combinedC_daily$daily_avg_sentiment, lag.max =40) +labs(title ="Partial Autocorrelation Function (PACF): USA Daily Sentiment",subtitle ="Identifying direct temporal relationships in daily sentiment data",x ="Lag (days)",y ="PACF" ) +theme_minimal() +theme(plot.title =element_text(size =14, face ="bold"),plot.subtitle =element_text(size =12) )
The PACF of daily sentiment shows only four lags with significant partial autocorrelation, indicating that short-term dependencies exist for a few immediate days, but beyond that, sentiment behaves largely independently. This suggests that if a time series model is applied, only a low-order autoregressive component is needed, and incorporating seasonality or event-based regressors would likely provide more explanatory power.
Analysis of top sources for health and science
# View main sources of news for science and health category in the USA# Analyze top news sources for health and science categoriesusa_sources_health_science <- df_countries %>%filter(location =="United States", category %in%c("health", "science")) %>%group_by(category, `source-name`) %>%summarise(n_articles =n(),avg_sentiment =mean(compound, na.rm =TRUE),avg_bias =mean(bias_score, na.rm =TRUE),.groups ='drop' ) %>%arrange(category, desc(n_articles))# Get top 10 sources for each categorytop_sources_health <- usa_sources_health_science %>%filter(category =="health") %>%slice_head(n =10)top_sources_science <- usa_sources_health_science %>%filter(category =="science") %>%slice_head(n =10)# Display the resultscat("Top 10 Health News Sources in USA (2020-2021):\n")
Top 10 Health News Sources in USA (2020-2021):
print(top_sources_health)
# A tibble: 10 × 5
category `source-name` n_articles avg_sentiment avg_bias
<chr> <chr> <int> <dbl> <dbl>
1 health Google News 632 -0.0422 0.110
2 health Yahoo Entertainment 630 -0.0427 0.135
3 health CNN 426 -0.103 0.182
4 health New York Times 357 -0.0601 0.586
5 health Fox News 344 -0.0605 -0.0684
6 health Eatthis.com 189 -0.0302 -0.0889
7 health USA Today 188 -0.0689 0.0879
8 health New York Post 170 -0.167 0.235
9 health Lost Coast Outpost 168 -0.316 -0.706
10 health NPR 163 -0.0767 0.0140
cat("\nTop 10 Science News Sources in USA (2020-2021):\n")
Top 10 Science News Sources in USA (2020-2021):
print(top_sources_science)
# A tibble: 10 × 5
category `source-name` n_articles avg_sentiment avg_bias
<chr> <chr> <int> <dbl> <dbl>
1 science Phys.Org 193 0.0702 0.519
2 science CNN 160 0.0135 0.296
3 science Yahoo Entertainment 124 0.0944 0.314
4 science Space.com 119 0.0488 0.329
5 science SciTechDaily 112 0.115 0.428
6 science New York Times 93 -0.0236 0.816
7 science NASA 81 0.0905 0.254
8 science Live Science 80 -0.0307 0.147
9 science Fox News 79 0.0239 0.214
10 science Business Insider 72 -0.0254 0.109
# Create visualization for top news sources by article count# Health News Sources Visualizationdf_usa_health <- df_countries_2020 %>%filter(location =="United States", category =="health")# Top 20 health news sourcestop_health_sources <- df_usa_health %>%count(`source-name`, sort =TRUE) %>%slice_head(n =20)# Horizontal bar chart for health sourceshealth_sources_plot <-plot_ly(data = top_health_sources,x =~n,y =~reorder(`source-name`, n),type ='bar',orientation ='h',marker =list(color ='#E74C3C'),name ='Health Sources') %>%layout(title =list(text ='Top 20 USA Health News Sources by Article Count (2020-2021)',font =list(family ="Times New Roman") ),xaxis =list(title ='Article Count'),yaxis =list(title ='News Source'),margin =list(l =150) )health_sources_plot
# Average Sentiment for top 5 health sources# Get top 5 health sources for sentiment analysistop_5_health_sentiment <- top_sources_health %>%slice_head(n =5)# Create vertical bar chart for health sentimenthealth_sentiment_plot <-plot_ly(data = top_5_health_sentiment,x =~reorder(`source-name`, avg_sentiment),y =~avg_sentiment,type ='bar',marker =list(color ='#27AE60'),name ='Health Sentiment',text =~paste("Articles:", n_articles, "<br>Sentiment:", round(avg_sentiment, 3)),textposition ='auto') %>%layout(title =list(text ='Average Sentiment for Top 5 USA Health News Sources (2020-2021)',font =list(family ="Times New Roman") ),xaxis =list(title ='News Source'),yaxis =list(title ='Average Sentiment Score'),margin =list(b =100) )health_sentiment_plot
# Average Bias for top 5 health sources# Get top 5 health sources for bias analysistop_5_health_bias <- top_sources_health %>%slice_head(n =5)# Create vertical bar chart for health biashealth_bias_plot <-plot_ly(data = top_5_health_bias,x =~reorder(`source-name`, avg_bias),y =~avg_bias,type ='bar',marker =list(color ='#E67E22'),name ='Health Bias',text =~paste("Articles:", n_articles, "<br>Bias:", round(avg_bias, 3)),textposition ='auto') %>%layout(title =list(text ='Average Bias for Top 5 USA Health News Sources (2020-2021)',font =list(family ="Times New Roman") ),xaxis =list(title ='News Source'),yaxis =list(title ='Average Bias Score'),margin =list(b =100) )health_bias_plot
# Average Sentiment for top 5 science sources# Get top 5 science sources for sentiment analysistop_5_science_sentiment <- top_sources_science %>%slice_head(n =5)# Create vertical bar chart for science sentimentscience_sentiment_plot <-plot_ly(data = top_5_science_sentiment,x =~reorder(`source-name`, avg_sentiment),y =~avg_sentiment,type ='bar',marker =list(color ='#8E44AD'),name ='Science Sentiment',text =~paste("Articles:", n_articles, "<br>Sentiment:", round(avg_sentiment, 3)),textposition ='auto') %>%layout(title =list(text ='Average Sentiment for Top 5 USA Science News Sources (2020-2021)',font =list(family ="Times New Roman") ),xaxis =list(title ='News Source'),yaxis =list(title ='Average Sentiment Score'),margin =list(b =100) )science_sentiment_plot
# Average Bias for top 5 science sources# Get top 5 science sources for bias analysistop_5_science_bias <- top_sources_science %>%slice_head(n =5)# Create vertical bar chart for science biasscience_bias_plot <-plot_ly(data = top_5_science_bias,x =~reorder(`source-name`, avg_bias),y =~avg_bias,type ='bar',marker =list(color ='#E74C3C'),name ='Science Bias',text =~paste("Articles:", n_articles, "<br>Bias:", round(avg_bias, 3)),textposition ='auto') %>%layout(title =list(text ='Average Bias for Top 5 USA Science News Sources (2020-2021)',font =list(family ="Times New Roman") ),xaxis =list(title ='News Source'),yaxis =list(title ='Average Bias Score'),margin =list(b =100) )science_bias_plot
Deep Dive into Health and Science trends during the pandemic
# Combined time series plot for sentiment and bias in the health category, daily# Prepare daily health data for USA with both sentiment and biasusa_health_daily <- df_countries %>%filter(location =="United States", category =="health") %>%group_by(publishedAt) %>%summarise(avg_sentiment =mean(compound, na.rm =TRUE),avg_bias =mean(bias_score, na.rm =TRUE),n_articles =n(),.groups ='drop' ) %>%rename(date = publishedAt) %>%# Transform to long format for plottingpivot_longer(cols =c(avg_sentiment, avg_bias),names_to ="metric",values_to ="score" ) %>%mutate(metric =case_when( metric =="avg_sentiment"~"Sentiment", metric =="avg_bias"~"Bias" ) )head(usa_health_daily)
# Create combined plot with dual y-axes# Create the base plot with sentimentusa_health_combined_plot <-ggplot(usa_health_daily, aes(x = date, y = score, color = metric)) +geom_line(size =0.5, alpha =0.7) +facet_wrap(~ metric, scales ="free_y", ncol =1) +labs(title ="Daily Sentiment and Bias Trends for USA Health News",subtitle ="From 2020 - 2021 (Pandemic Period)",x ="Date",y ="Score",color ="Metric" ) +theme_minimal() +theme(legend.position ="bottom",axis.text.x =element_text(angle =45, hjust =1),strip.text =element_text(face ="bold") ) +scale_x_date(limits =c(as.Date("2020-01-01"), as.Date("2021-12-31")),date_breaks ="3 months", date_labels ="%Y-%m" ) +scale_color_manual(values =c("Sentiment"="#2E86AB", "Bias"="#A23B72"))ggplotly(usa_health_combined_plot) %>%layout(hovermode ="x")
# Decomposition of health sentiment time series in the USA# Prepare monthly aggregated health sentiment data for USAusa_health_monthly <- df_countries %>%filter(location =="United States", category =="health") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_sentiment =mean(compound, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencyhealth_ts <-ts(usa_health_monthly$avg_sentiment, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(health_ts)
# Time indext <-time(health_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Health Sentiment (2020-2021)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )
# Decomposition of health bias time series in the USA# Prepare monthly aggregated health bias data for USAusa_health_bias_monthly <- df_countries %>%filter(location =="United States", category =="health") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_bias =mean(bias_score, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencybias_ts <-ts(usa_health_bias_monthly$avg_bias, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(bias_ts)
# Time indext <-time(bias_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Health bias (2020-2021)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )
Similar decomposition for science category
# Decomposition of science sentiment time series in the USA# Prepare monthly aggregated science sentiment data for USAusa_science_monthly <- df_countries %>%filter(location =="United States", category =="science") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_sentiment =mean(compound, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencyscience_ts <-ts(usa_science_monthly$avg_sentiment, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(science_ts)
# Time indext <-time(science_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Science Sentiment (2020-2021)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )
# Decomposition of science bias time series in the USA# Prepare monthly aggregated science bias data for USAusa_science_bias_monthly <- df_countries %>%filter(location =="United States", category =="science") %>%mutate(year_month =floor_date(publishedAt, "month")) %>%group_by(year_month) %>%summarise(avg_bias =mean(bias_score, na.rm =TRUE), .groups ='drop') %>%arrange(year_month)# Create time series object with monthly frequencybias_ts <-ts(usa_science_bias_monthly$avg_bias, start =c(2018, 1), frequency =12)# Decompose (additive by default)dec <-decompose(bias_ts)
# Time indext <-time(bias_ts)# Make four simple plotsp1 <-plot_ly(x =~t, y =~as.numeric(dec$x), type ="scatter", mode ="lines", name ="Observed", showlegend =FALSE)p2 <-plot_ly(x =~t, y =~as.numeric(dec$trend), type ="scatter", mode ="lines", name ="Trend", showlegend =FALSE)p3 <-plot_ly(x =~t, y =~as.numeric(dec$seasonal), type ="scatter", mode ="lines", name ="Seasonal", showlegend =FALSE)p4 <-plot_ly(x =~t, y =~as.numeric(dec$random), type ="scatter", mode ="lines", name ="Remainder", showlegend =FALSE)subplot(p1, p2, p3, p4, nrows =4, shareX =TRUE) |>layout(title ="Decomposition of USA Science Bias (2020-2021)",xaxis =list(title ="Year"),yaxis =list(title ="Observed", nticks =5, tickfont =list(size =10)),yaxis2 =list(title ="Trend", nticks =5, tickfont =list(size =10)),yaxis3 =list(title ="Seasonal", nticks =3, tickformat =".3f", tickfont =list(size =10)),yaxis4 =list(title ="Remainder",nticks =3, tickformat =".3f", tickfont =list(size =10)),margin =list(l =70, r =10, t =40, b =40) )